% Rahman, Xu, Sigal -- conference paper, International Conference on Computer Vision, 2019.
% Title is stored in Title Case so the bibliography style decides the final casing;
% the IEEE/CVF acronym is braced so it survives any style-applied recasing.
@inproceedings{rahman2019watch,
  author    = {Rahman, Tanzila and Xu, Bicheng and Sigal, Leonid},
  title     = {Watch, Listen and Tell: Multi-Modal Weakly Supervised Dense Event Captioning},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision},
  pages     = {8908--8917},
  year      = {2019},
}